< Case Study

R Markdown

#Data analysis on craft beers and breweries accross the United States. #Project is created with: R 4.1.2 #Packages: caret, class, gganimate, ggrepel, ggthemes, gifski, kableExtra, maps, tidyverse, tigris, viridis

#Question 2 & 3
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
##     filter, lag
## The following objects are masked from 'package:base':
##
##     intersect, setdiff, setequal, union
library(tidyr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ stringr 1.4.0
## ✓ readr   2.1.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
#reading in the csvs

brewery = read.csv("./Data Files/Breweries.csv") #read breweries.csv
beer = read.csv("./Data Files/Beers.csv") #read beers.csv

#renaming columns for the join to the beer dataset
brewery=rename(brewery, Brewery_id=Brew_ID, Brewery=Name)
brewery$State <- trimws(brewery$State, which = c("left"))#added to remove space before state

#pulling ids with inaccurate info
Brewery_id<-as.integer(c(96,415,167, 262, 139))
error_brew_ids<-as.data.frame(Brewery_id)
#joining to get Brewery Names
brew_filter = merge(x=error_brew_ids,y=brewery,by="Brewery_id") %>% select(Brewery_id, Brewery)

#joining on brewery name to pull in correct columns--filter out inaccurate
brew_filter = merge(x=brewery,y=brew_filter,by="Brewery") %>% mutate(type = case_when( Brewery_id.x==Brewery_id.y ~ "correct",TRUE~  "wrong"))
#filtering down to rows with errors
brew_replace =brew_filter %>%filter(type == "wrong") %>% select(Brewery_id.y, Brewery, City, State)%>% rename(Brewery_id=Brewery_id.y)

#removing errors and replacing with updated df
brewery = anti_join(x=brewery,y=error_brew_ids,by="Brewery_id")
brewery<-rbind(brewery, brew_replace)
brewery$City = str_replace(brewery$City, "Menominie", "Menomonie")

#join between brewery and beer data
beer_brewery = merge(x=brewery,y=beer,by="Brewery_id")

#print first 6 rows
head(beer_brewery)
##   Brewery_id            Brewery        City State          Name Beer_ID   ABV
## 1          1 NorthGate Brewing  Minneapolis    MN       Pumpion    2689 0.060
## 2          1 NorthGate Brewing  Minneapolis    MN    Stronghold    2688 0.060
## 3          1 NorthGate Brewing  Minneapolis    MN   Parapet ESB    2687 0.056
## 4          1 NorthGate Brewing  Minneapolis    MN  Get Together    2692 0.045
## 5          1 NorthGate Brewing  Minneapolis    MN Maggie's Leap    2691 0.049
## 6          1 NorthGate Brewing  Minneapolis    MN    Wall's End    2690 0.048
##   IBU                               Style Ounces
## 1  38                         Pumpkin Ale     16
## 2  25                     American Porter     16
## 3  47 Extra Special / Strong Bitter (ESB)     16
## 4  50                        American IPA     16
## 5  26                  Milk / Sweet Stout     16
## 6  19                   English Brown Ale     16
#print last 6 rows
tail(beer_brewery)
##      Brewery_id                       Brewery          City State
## 2405        556         Ukiah Brewing Company         Ukiah    CA
## 2406        557       Butternuts Beer and Ale Garrattsville    NY
## 2407        557       Butternuts Beer and Ale Garrattsville    NY
## 2408        557       Butternuts Beer and Ale Garrattsville    NY
## 2409        557       Butternuts Beer and Ale Garrattsville    NY
## 2410        558 Sleeping Lady Brewing Company     Anchorage    AK
##                           Name Beer_ID   ABV IBU                   Style Ounces
## 2405             Pilsner Ukiah      98 0.055  NA         German Pilsener     12
## 2406         Porkslap Pale Ale      49 0.043  NA American Pale Ale (APA)     12
## 2407           Snapperhead IPA      51 0.068  NA            American IPA     12
## 2408         Moo Thunder Stout      50 0.049  NA      Milk / Sweet Stout     12
## 2409  Heinnieweisse Weissebier      52 0.049  NA              Hefeweizen     12
## 2410 Urban Wilderness Pale Ale      30 0.049  NA        English Pale Ale     12
#3.  Address the missing values in each column.
#filtering of NAs for ABV and IBU-- rows drop from 2410 down to 1405 for this filtering
#only use this filtered dataset for the questions related to ABV/IBU#only use this filtered dataset for the questions related to ABV/IBU
clean_beer_brewery <- beer_brewery %>% filter_at(vars(ABV,IBU),all_vars(!is.na(.)))
#question 1
#How many breweries are present in each state? (counties heatmap)
#load libraries
library(ggplot2)
library(maps)
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
##     map
#library(tidyverse)#added to 1st chunk
library(ggthemes)
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
##     group_rows
#load counties table
cnty <- read.csv("./Data Files/cty-cnty.csv")
colnames(cnty)[3] <- "region"
#load population data 2021
pop_est <- read.csv("./Data Files/NST-EST2021-alldata.csv")
pop_est <- pop_est %>% select(NAME,POPESTIMATE2021) %>% rename(region=NAME, "Pop2021"=POPESTIMATE2021) %>% mutate(region=tolower(region))
pop_est <- pop_est[-c(1:5),]
pop_est$region %>% str_replace("District of Columbia", "washington, d.c.")
##  [1] "alabama"              "alaska"               "arizona"
##  [4] "arkansas"             "california"           "colorado"
##  [7] "connecticut"          "delaware"             "district of columbia"
## [10] "florida"              "georgia"              "hawaii"
## [13] "idaho"                "illinois"             "indiana"
## [16] "iowa"                 "kansas"               "kentucky"
## [19] "louisiana"            "maine"                "maryland"
## [22] "massachusetts"        "michigan"             "minnesota"
## [25] "mississippi"          "missouri"             "montana"
## [28] "nebraska"             "nevada"               "new hampshire"
## [31] "new jersey"           "new mexico"           "new york"
## [34] "north carolina"       "north dakota"         "ohio"
## [37] "oklahoma"             "oregon"               "pennsylvania"
## [40] "rhode island"         "south carolina"       "south dakota"
## [43] "tennessee"            "texas"                "utah"
## [46] "vermont"              "virginia"             "washington"
## [49] "west virginia"        "wisconsin"            "wyoming"
## [52] "puerto rico"
pop_est$rank <- rank(-pop_est$Pop2021)

#wrangle data
#brewery$State <- trimws(brewery$State, which = c("left"))
cnty <- cnty %>% select("City", "State","County","region")#select columns
cnty <- cnty[!duplicated(cnty),]


#left join brew and cnty
brewcomb <- merge(brewery, select(cnty, c("City", "State", "County","region")), by=c("City","State"))
brewcomb <- brewcomb %>% distinct(Brewery_id, .keep_all = TRUE)

#1 table wtih state count, map with state count and map with county count
#state and county tables
us_states <- map_data("state")
us_counties <- map_data("county")

#state brewery count
brew_state <- brewcomb %>% mutate(region=tolower(region)) %>%
  group_by(region) %>% count(region)
brew_state <- brew_state %>% left_join(pop_est,by="region") %>% arrange(desc(n))
brew_state %>% mutate(region=str_to_title(region),Pop2021=round(Pop2021/1000000,2)) %>% kable(col.names = c("State","Breweries","Population 2021","Pop. Rank")) %>%
  kable_styling(latex_options = c("striped", "scale_down")) %>%
  row_spec(row = 0, italic = T, background = "#21918c", color = "white") %>%
  column_spec(1:2, width = "0.5in")
State Breweries Population 2021 Pop. Rank
Colorado 47 5.81 21
California 39 39.24 1
Michigan 33 10.05 10
Oregon 29 4.25 27
Texas 28 29.53 2
Pennsylvania 25 12.96 5
Washington 23 7.74 13
Indiana 22 6.81 17
Massachusetts 22 6.98 15
Wisconsin 20 5.90 20
North Carolina 19 10.55 9
Illinois 18 12.67 6
New York 16 19.84 4
Virginia 16 8.64 12
Florida 15 21.78 3
Ohio 15 11.78 7
Minnesota 12 5.71 22
Arizona 11 7.28 14
Vermont 10 0.65 51
Maine 9 1.37 43
Missouri 9 6.17 18
Montana 9 1.10 44
Connecticut 8 3.61 29
Alaska 7 0.73 49
Georgia 7 10.80 8
Maryland 7 6.17 19
Oklahoma 6 3.99 28
Idaho 5 1.90 39
Iowa 5 3.19 32
Louisiana 5 4.62 25
Nebraska 5 1.96 38
Rhode Island 5 1.10 45
Hawaii 4 1.44 41
Kentucky 4 4.51 26
New Mexico 4 2.12 37
South Carolina 4 5.19 23
Utah 4 3.34 30
Wyoming 4 0.58 52
Alabama 3 5.04 24
Kansas 3 2.93 36
New Hampshire 3 1.39 42
New Jersey 3 9.27 11
Tennessee 3 6.98 16
Arkansas 2 3.03 34
Delaware 2 1.00 46
Mississippi 2 2.95 35
Nevada 2 3.14 33
North Dakota 1 0.77 48
South Dakota 1 0.90 47
Washington, D.c. 1 NA NA
West Virginia 1 1.78 40
#state gradient map
us_states %>% left_join(brew_state,by=("region")) %>%
  ggplot(aes(x=long,y=lat,group=group,fill=n))+
  geom_polygon(color = "gray90", size=.1)+
  coord_map(projection = "albers", lat0=45, lat1=55)+
  scale_fill_viridis_c()+
  theme(axis.line = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        panel.background = element_blank(),
        axis.title = element_blank())+
  labs(fill="Brewery\nCount")+
  ggtitle("Breweries by State\nContinental US")

#county brewery count
brew_county <- brewcomb %>% mutate(subregion=tolower(County)) %>%
  group_by(subregion) %>% count(subregion)
#county gradient map
us_counties %>% left_join(brew_county,by="subregion") %>%
  ggplot()+
  geom_polygon(aes(x=long,y=lat,group=group,fill=n),color = "gray90",size=.1)+
  coord_map(projection = "albers",lat0=45,lat1=55)+
  scale_fill_viridis_c()+
  theme(axis.line = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        panel.background = element_blank(),
        axis.title = element_blank())+
  labs(fill="Brewery\nCount")+
  ggtitle("Breweries by County\nContinental US")+
  geom_polygon(data = us_states,aes(x=long,y=lat,group=group), color = "white",size=.15,fill="transparent")

#The table above shows the breweries present in each state, the top 5 states are Colorado (47), California (39), Michigan (33), Oregion (29), and Texas (28).

#Question 4
#Compute the median alcohol content and international bitterness unit for each state. Plot a bar chart to compare.

#ABV
bb<-clean_beer_brewery %>% group_by(State) %>% summarize(medianABV = median(ABV), count = n()) %>% ggplot(aes(x = reorder(State, -medianABV), y = medianABV, fill=medianABV)) + geom_col()+
  xlab('State') + ylab('Median ABV')+ggtitle("Median ABV by State") +theme(axis.text = element_text(size = 6))+ scale_fill_gradient(low='#46085C', high='#4FBA6E', name='Median ABV')
bb + scale_y_continuous(labels = scales::percent)

#Question 4
#DC and Kentucky have the highest Median ABV. Both states have a high % of brands with the American Double / Imperial IPA, which has #an ABV between 7-14%

#IBU
bb<-clean_beer_brewery %>% group_by(State) %>% summarize(medianABV = median(ABV), medianIBU=round(median(IBU),0), count = n()) %>% ggplot(aes(x = reorder(State, -medianABV), y = medianIBU, fill=medianIBU)) + geom_col()+
  xlab('State') + ylab('Median IBU')+ggtitle("Median IBU by State") +theme(axis.text = element_text(size = 6))+ scale_fill_gradient(low='#46085C', high='#4FBA6E', name='Median IBU')
bb 

#Question 4
#Maine and West Virginia have the highest Median IBU. Maine is known for the IPAs, which indexes higher on IBU.


#combined
bb<-clean_beer_brewery %>% group_by(State) %>% summarize(medianABV = median(ABV), medianIBU=round(median(IBU),0), count = n()) %>% ggplot(aes(x = reorder(State, -medianABV), y = medianABV, fill=medianIBU)) + geom_col()+
  xlab('State') + ylab('Median ABV')+ggtitle("Median IBU/ABV by State") +theme(axis.text = element_text(size = 6))+ scale_fill_gradient(low='#46085C', high='#4FBA6E', name='Median IBU')
bb + scale_y_continuous(labels = scales::percent)

#Question 4
#Delaware and New Hampshire have some of the highest IBU for their ABV. There are some articles from Delaware about "table beer" and #low ABV fruit beers.
#However, both states combined represent <1% of the data of the overall data.
#5 which state has the maximum alcoholic (ABV) beer? which state has the most bitter (IBU) beer?

library(kableExtra)
#maximum ABV by state
maxABV <- beer_brewery %>% merge(cnty, by=c("City", "State")) %>% select(ABV, region) %>%
  filter(!is.na(ABV)) %>% group_by(region) %>% summarise(Max_ABV=max(ABV)) %>% arrange(desc(Max_ABV))
maxABV %>% kable(col.names = c("State", "Max ABV")) %>%
  kable_styling(latex_options = c("striped", "scale_down")) %>%
  row_spec(row = 0, italic = T, background = "#21918c", color = "white") %>%
  column_spec(1:2, width = "0.5in")
State Max ABV
Colorado 0.128
Kentucky 0.125
Indiana 0.120
New York 0.100
California 0.099
Idaho 0.099
Maine 0.099
Massachusetts 0.099
Michigan 0.099
Minnesota 0.099
Nevada 0.099
New Jersey 0.099
North Carolina 0.099
Ohio 0.099
Pennsylvania 0.099
Texas 0.099
Wisconsin 0.099
South Carolina 0.097
Illinois 0.096
Nebraska 0.096
Vermont 0.096
Arizona 0.095
Iowa 0.095
Alabama 0.093
Washington, D.C. 0.092
Connecticut 0.090
Utah 0.090
Louisiana 0.088
Oregon 0.088
Virginia 0.088
Rhode Island 0.086
Kansas 0.085
Maryland 0.085
Oklahoma 0.085
Washington 0.084
Hawaii 0.083
Florida 0.082
Mississippi 0.080
Missouri 0.080
New Mexico 0.080
Montana 0.075
Georgia 0.072
Wyoming 0.072
South Dakota 0.069
Alaska 0.068
North Dakota 0.067
West Virginia 0.067
New Hampshire 0.065
Tennessee 0.062
Arkansas 0.061
Delaware 0.055
#Maximum IBU by state
maxIBU <- beer_brewery %>% merge(cnty, by=c("City", "State")) %>% select(IBU, region) %>%
  filter(!is.na(IBU)) %>% group_by(region) %>% summarise(Max_IBU=max(IBU)) %>% arrange(desc(Max_IBU))
maxIBU %>% kable(col.names = c("State","Max IBU")) %>%
  kable_styling(latex_options = c("striped", "scale_down")) %>%
  row_spec(row = 0, italic = T, background = "#21918c", color = "white") %>%
  column_spec(1:2, width = "0.5in")
State Max IBU
Oregon 138
Virginia 135
Massachusetts 130
Ohio 126
Minnesota 120
Vermont 120
Texas 118
California 115
Indiana 115
Michigan 115
Washington, D.C. 115
Pennsylvania 113
New York 111
Kansas 110
Colorado 104
Alabama 103
Idaho 100
Illinois 100
New Jersey 100
New Mexico 100
Oklahoma 100
Arizona 99
Iowa 99
North Carolina 98
Maryland 90
Nevada 90
Missouri 89
Connecticut 85
Utah 83
Washington 83
Florida 82
New Hampshire 82
Kentucky 80
Mississippi 80
Montana 80
Wisconsin 80
Hawaii 75
Rhode Island 75
Wyoming 75
Alaska 71
West Virginia 71
Maine 70
North Dakota 70
Georgia 65
Nebraska 65
South Carolina 65
Tennessee 61
Louisiana 60
Delaware 52
Arkansas 39
#US map for Max ABV by state
us_states %>% mutate(region=str_to_title(region)) %>% left_join(maxABV,by="region") %>%
  ggplot(aes(x=long,y=lat,group=group, fill = Max_ABV))+
  geom_polygon(color = "gray90", size=.1)+
  coord_map(projection = "albers", lat0=45, lat1=55)+
  scale_fill_viridis_c()+
  theme(axis.line = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        panel.background = element_blank(),
        axis.title = element_blank())+
  labs(fill="Maximum\nAlcohol by Volume")+
  ggtitle("Maximum ABV by State\nContinental US")

#US map for max IBU by state
us_states %>% mutate(region=str_to_title(region)) %>% left_join(maxIBU,by="region") %>%
  ggplot(aes(x=long,y=lat,group=group, fill = Max_IBU))+
  geom_polygon(color = "gray90", size=.1)+
  coord_map(projection = "albers", lat0=45, lat1=55)+
  scale_fill_viridis_c()+
  theme(axis.line = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        panel.background = element_blank(),
        axis.title = element_blank())+
  labs(fill="Maximum\nIBU")+
  ggtitle("Maximum IBU by State\nContinental US")

#Colorado is the state with the beer with the maximum alcohol by volume (ABV) with .128. Oregon is the state with the beer with the maximum International Bitterness Units (IBU) with .138.

#Question 6
#6 comment on the summary statistics and distribution of the ABV variable

#Density Curve for ABV
beer_brewery %>% ggplot(aes(x=ABV))+geom_density(color="#39568C")+theme_bw()+theme(
  panel.grid = element_blank(),
  panel.border = element_blank())
## Warning: Removed 62 rows containing non-finite values (stat_density).

#Boxplot for ABV
beer_brewery %>% ggplot(aes(x=ABV))+geom_boxplot()
## Warning: Removed 62 rows containing non-finite values (stat_boxplot).

#5 number summary and mean for ABV
ABVsum <- beer_brewery %>% filter(!is.na(ABV)) %>%
  select(ABV) %>%
  summarise(min=min(ABV),
            "25%"=quantile(ABV,.25),
            median=median(ABV),
            "75%"=quantile(ABV,.75),
            max=max(ABV),mean=mean(ABV))
ABVsum %>% kable(col.names = c("Min", "25%", "Med", "75%", "Max", "Mean")) %>%
  kable_styling(latex_options = c("striped", "scale_down")) %>%
  row_spec(row = 0, italic = T, background = "#21918c", color = "white") %>%
  column_spec(1:2, width = "0.5in")
Min 25% Med 75% Max Mean
0.001 0.05 0.056 0.067 0.128 0.0597734
#right skewed distribution centered at .057, there seems to be some degree of rounding occuring at around .1,
#given that there are several beers that are exactly at the line with fewer slightly below and almost none
#slightly above the number. There also seems to be some degree of rounding at .5, but this is less pronounced

#right skewed distribution centered at .057, there seems to be some degree of rounding occuring at around .1, given that there are several beers that are exactly at the line with fewer slightly below and almost none slightly above the number. There also seems to be some degree of rounding at .5, but this is less pronounced

#Question 7
#7.Is there an apparent relationship between the bitterness of the beer and its alcoholic content? Draw a scatter plot. Make your best judgment of a relationship and EXPLAIN your answer.

#comparison of ABV/IBU in scatter plot with regression line

#regression line
bb.lm <- lm(ABV~ IBU, beer_brewery)
bb = clean_beer_brewery %>% ggplot(aes(x = ABV, y=IBU, color = IBU)) + geom_point(aes(fill=IBU, color=IBU),pch=21,size=3,colour="black")+ ggtitle("International Bitterness Units vs. Alchohol by Volume")+
  geom_smooth(method='lm', formula= y~x, color="black")
bb + scale_x_continuous(labels = scales::percent)

str(summary(bb.lm))
## List of 12
##  $ call         : language lm(formula = ABV ~ IBU, data = beer_brewery)
##  $ terms        :Classes 'terms', 'formula'  language ABV ~ IBU
##   .. ..- attr(*, "variables")= language list(ABV, IBU)
##   .. ..- attr(*, "factors")= int [1:2, 1] 0 1
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:2] "ABV" "IBU"
##   .. .. .. ..$ : chr "IBU"
##   .. ..- attr(*, "term.labels")= chr "IBU"
##   .. ..- attr(*, "order")= int 1
##   .. ..- attr(*, "intercept")= int 1
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
##   .. ..- attr(*, "predvars")= language list(ABV, IBU)
##   .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
##   .. .. ..- attr(*, "names")= chr [1:2] "ABV" "IBU"
##  $ residuals    : Named num [1:1405] 0.00174 0.0063 -0.00542 -0.01747 -0.00505 ...
##   ..- attr(*, "names")= chr [1:1405] "1" "2" "3" "4" ...
##  $ coefficients : num [1:2, 1:4] 4.49e-02 3.51e-04 5.18e-04 1.04e-05 8.68e+01 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "(Intercept)" "IBU"
##   .. ..$ : chr [1:4] "Estimate" "Std. Error" "t value" "Pr(>|t|)"
##  $ aliased      : Named logi [1:2] FALSE FALSE
##   ..- attr(*, "names")= chr [1:2] "(Intercept)" "IBU"
##  $ sigma        : num 0.0101
##  $ df           : int [1:3] 2 1403 2
##  $ r.squared    : num 0.45
##  $ adj.r.squared: num 0.449
##  $ fstatistic   : Named num [1:3] 1147 1 1403
##   ..- attr(*, "names")= chr [1:3] "value" "numdf" "dendf"
##  $ cov.unscaled : num [1:2, 1:2] 2.64e-03 -4.52e-05 -4.52e-05 1.06e-06
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "(Intercept)" "IBU"
##   .. ..$ : chr [1:2] "(Intercept)" "IBU"
##  $ na.action    : 'omit' Named int [1:1005] 19 35 36 37 38 39 40 41 42 43 ...
##   ..- attr(*, "names")= chr [1:1005] "19" "35" "36" "37" ...
##  - attr(*, "class")= chr "summary.lm"
#Question 7
#There is evidence of a positive relationship between ABV and IBU, based on the regression line.
#R-Squared has a value of 0.45, meaning ABV explains 45% of the variation in IBU.
#Question 8
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
##     lift
library(class)

#Budweiser would also like to investigate the difference with respect to IBU and ABV between IPAs (India Pale Ales) and other types of Ale (any beer with “Ale” in its name other than IPA).  You decide to use KNN classification to investigate this relationship.  Provide statistical evidence one way or the other. You can of course assume your audience is comfortable with percentages … KNN is very easy to understand conceptually.

#add ipa, ale, other column
ipa_ale <-
  clean_beer_brewery %>% mutate(Ale_type = ifelse(
    str_detect(clean_beer_brewery$Style, regex("ipa", ignore_case = TRUE)),
    "IPA",
    ifelse(str_detect(
      clean_beer_brewery$Style, regex("ale", ignore_case = TRUE)
    ), "Ale", "other")
  ))
#filter out "other" type of beers and drop levels
ipa_ale <- ipa_ale %>% filter(Ale_type %in% c("IPA", "Ale")) %>% droplevels(ipa_ale$Ale_type)

#for loops for KNN model and hypertuning k parameter
nsplits <- 100
nks <- 30

df <- data.frame()
set.seed(6)
for (i in 1:nsplits) {
  n <- nrow(ipa_ale)
  nsamp <- sample(1:n,round(n*.7)) #for a 70-30 split
  ipa_train <- ipa_ale[nsamp,]
  ipa_test <- ipa_ale[-nsamp,]

  for (j in 1:nks) {
    cm <- knn(scale(ipa_train[,7:8]),scale(ipa_test[,7:8]),cl=ipa_train$Ale_type,k=i) %>% confusionMatrix(as.factor(ipa_test$Ale_type),positive = "IPA")
    cm_values <- data.frame("k"=j,"Accuracy"=cm$overall[1],"Sensitivity"=cm$byClass[1],"Specificity"=cm$byClass[2])
    df <- rbind(df,cm_values)
  }
}

#summarize the k results
means <- df %>% group_by(k) %>% summarise(Accuracy=mean(Accuracy)*100,Sensitivity=mean(Sensitivity)*100,Specificity=mean(Specificity)*100)

#plot accuracy, sensitivity and specificity
means %>% ggplot(aes(x=k,y=Accuracy))+geom_line(color="#39568C")+theme_bw()+theme(
  panel.grid = element_blank(),
  panel.border = element_blank())

means %>% ggplot(aes(x=k,y=Sensitivity))+geom_line(color="#39568C")+theme_bw()+theme(
  panel.grid = element_blank(),
  panel.border = element_blank())

means %>% ggplot(aes(x=k,y=Specificity))+geom_line(color="#39568C")+theme_bw()+theme(
  panel.grid = element_blank(),
  panel.border = element_blank())

#After hypertuning the k parameter, we can see that looking at the 14 closest beers in terms of ABV and IBU would help us make a prediction on whether it’s a Indian Pale Ale or another type of Ale, with about 85% accuracy. #moreover this model is able to properly classify other ales as Ale (Specificity) with almost 86% accuracy, and classify IPA as IPA (sensitivity) with 83.64% accuracy.

#Question 9
#Knock their socks off!  Find one other useful inference from the data that you feel Budweiser may be able to find value in.  You must convince them why it is important and back up your conviction with appropriate statistical evidence.

#join brew/beer and cnty (bubble map data for brews)
library(tidyverse)
library(tigris)
## To enable
## caching of data, set `options(tigris_use_cache = TRUE)` in your R script or .Rprofile.
library(viridis)
## Loading required package: viridisLite
##
## Attaching package: 'viridis'
## The following object is masked from 'package:maps':
##
##     unemp
us_counties <- map_data("county")
cnty<-cnty%>% select(City, State, region, County)
cnty<- cnty %>% distinct(City, State, .keep_all = TRUE)
us_counties<-us_counties %>% distinct(region, subregion, .keep_all = TRUE)

brewcomb_type <- merge(clean_beer_brewery, select(cnty, c("City", "State", "County","region")), by=c("City","State"))
brewcomb_type <- brewcomb_type %>% mutate(subregion=tolower(County),region=tolower(region))

#combining for lat long info
brewcomb_type <- merge(brewcomb_type, select(us_counties, c("lat", "long", "region","subregion")), by=c("region","subregion"))

#style text clean-up
library(stringr)

brewcomb_type$BeerStyle <- str_replace_all(brewcomb_type$Style, c(".*American.*IPA.*"="_American IPA",  ".*American.*Ale.*"= "_American Ale", ".*Stout.*"="_Stout",  ".*Lager.*"="_Lager",  ".*Pilsner.*"= "_Pilsner", ".*Bitter.*"= "_Bitter", ".*Porter.*"= "_Porter", ".*Belgian.*"= "_German",  ".*Bock.*"= "_German",   ".*Czech.*"= "_German", ".*Doppelbock.*"= "_German",  ".*German.*"= "_German",  ".*Gose.*"= "_German",  ".*Grisette.*"= "_German",  ".*Hefeweizen.*"= "_German",  ".*Kölsch.*"= "_German",  ".*Maibock.*"= "_German",  ".*Märzen.*"= "_German",  ".*Munich.*"= "_German",  ".*Saison.*"= "_German",  ".*Vienna.*"= "_German",  ".*Witbier.*"= "_German",  ".*Altbier.*"= "_German", ".*Baltic.*"= "_German",  ".*Berliner.*"= "_German",  ".*Keller.*"= "_German", ".*Schwarzbier.*"= "_German"))
brewcomb_type$BeerStyle<-str_replace(brewcomb_type$BeerStyle,"^(?!_).*$", "")
brewcomb_type$SimpleStyle<-str_replace(brewcomb_type$Style,".*Ale.*", "_Ale")
brewcomb_type$SimpleStyle<-str_replace(brewcomb_type$SimpleStyle,"^(?!_).*$", "")
brewcomb_type$BeerStyle <- paste(brewcomb_type$BeerStyle, brewcomb_type$SimpleStyle)
brewcomb_type$BeerStyle<-str_replace(brewcomb_type$BeerStyle,".*_American Ale _Ale.*", "_American Ale")
brewcomb_type$BeerStyle <- trimws(brewcomb_type$BeerStyle, which = c("left"))#added to remove space before style
brewcomb_type$BeerStyle<-str_replace(brewcomb_type$BeerStyle,"^(?!_).*$", "Other")
brewcomb_type$BeerStyle <- str_replace_all(brewcomb_type$BeerStyle, c(".*_"=" ", ".*German.*Ale.*"="German Ale", ".*American.*IPA.*"="American IPA"))
brewcomb_type$BeerStyle <- trimws(brewcomb_type$BeerStyle, which = c("right"))#added to remove space after style
brewcomb_type$BeerStyle <- trimws(brewcomb_type$BeerStyle, which = c("left"))#added to remove space before style

#summary of beer styles
sum_brew<-brewcomb_type %>% group_by(BeerStyle) %>% summarize(meanabv = mean(ABV), meanIBU = mean(IBU), beers = n()) %>% arrange(desc(beers))
sum_brew
## # A tibble: 10 × 4
##    BeerStyle    meanabv meanIBU beers
##    <chr>          <dbl>   <dbl> <int>
##  1 American Ale  0.0556    37.0   401
##  2 American IPA  0.0694    72.4   360
##  3 German        0.0522    23.9   158
##  4 Ale           0.0609    27.8   127
##  5 Lager         0.0504    23.5    79
##  6 Other         0.0621    27.1    79
##  7 Stout         0.0684    44.8    50
##  8 Porter        0.0631    33.8    40
##  9 Bitter        0.0551    43.8    16
## 10 Pilsner       0.0499    27.4    15
#summary table for powerpoint
sum_brew %>% kable(col.names = c("BeerStyle","meanabv","meanIBU","beers")) %>%
  kable_styling(latex_options = c("striped", "scale_down")) %>%
  row_spec(row = 0, italic = T, background = "#21918c", color = "white") %>%
  column_spec(1:2, width = "0.5in")
BeerStyle meanabv meanIBU beers
American Ale 0.0555636 37.04489 401
American IPA 0.0694222 72.42222 360
German 0.0522342 23.86709 158
Ale 0.0609213 27.82677 127
Lager 0.0503797 23.45570 79
Other 0.0621392 27.12658 79
Stout 0.0683800 44.84000 50
Porter 0.0631250 33.75000 40
Bitter 0.0551250 43.75000 16
Pilsner 0.0499333 27.40000 15
#summary table for bubble graph/map
bb<-brewcomb_type %>% group_by(BeerStyle, lat, long) %>% summarize(meanabv = mean(ABV), meanIBU = mean(IBU), beers = n()) %>% arrange(desc(beers))
## `summarise()` has grouped output by 'BeerStyle', 'lat'. You can override using the `.groups` argument.
bb
## # A tibble: 637 × 6
## # Groups:   BeerStyle, lat [609]
##    BeerStyle      lat   long meanabv meanIBU beers
##    <chr>        <dbl>  <dbl>   <dbl>   <dbl> <int>
##  1 American Ale  40.3 -106.   0.0639    54.0    22
##  2 American Ale  45.6 -122.   0.0469    32.8    20
##  3 American IPA  33.5 -118.   0.0719    75.7    17
##  4 American IPA  40.3 -106.   0.0796    70.1    15
##  5 American Ale  33.5 -118.   0.059     46.8    12
##  6 American Ale  47.6 -121.   0.0547    33.4    12
##  7 American IPA  28.2  -82.7  0.0707    66.3    12
##  8 American IPA  34.8 -119.   0.0745    71.1    11
##  9 American IPA  47.6 -121.   0.0593    59.5    11
## 10 Ale           39.6  -86.3  0.0724    25.8    10
## # … with 627 more rows
# Beer Style bubble map fixed
library(ggrepel)
usa <- map_data("state")
ggplot() +
  geom_polygon(data = usa, aes(x=long, y = lat, group = group),color = "white", fill="grey", alpha=0.3) +
  geom_point( data=bb, aes(x=long, y=lat, color=BeerStyle, size=beers,  alpha=0.5)) +
  scale_color_viridis(option="viridis", discrete=TRUE, name="Beer Style" ) +
  theme_void() +
  scale_size(range = c(1, 10), name=" # of beers")+
  ggtitle("Number of Beers by Style")

#animation of bubble graph using map and simplified beer styles
library(gganimate)
library(gifski)
p <- ggplot() +
  geom_polygon(data = usa, aes(x=long, y = lat, group = group),color = "white", fill="grey", alpha=0.3) +
  geom_point( data=bb, aes(x=long, y=lat, color=BeerStyle, size=beers,  alpha=0.5)) +
  scale_color_viridis_d(option="viridis", name="Beer Style" ) +
  theme_void() +
  scale_size(range = c(2, 20), name=" # of beers")+
  ggtitle("Number of Beers by Style\n {closest_state}")+
  theme(text = element_text(size=20))+
  transition_states(BeerStyle) +
  enter_fade() +
  enter_grow() +
  ease_aes('sine-in-out')
animate(p, width = 900, height = 572, duration = 14)

#boxplots by Beer Style
#boxplot by ABV/Style
gg<-ggplot(brewcomb_type, aes(x = BeerStyle, y = ABV, fill = BeerStyle)) +
  geom_boxplot() +
  theme_classic()
gg+coord_flip()+ scale_fill_viridis(option="magma", discrete=TRUE, name="Beer Style" )

#boxplot by IBU/Style
gg<-ggplot(brewcomb_type, aes(x = BeerStyle, y = IBU, fill = BeerStyle)) +
  geom_boxplot() +
  theme_classic()
gg+coord_flip()+ scale_fill_viridis(option="magma", discrete=TRUE, name="Beer Style" )

# knit together pngs into gif for powerpoint
library(magick)
## Linking to ImageMagick 6.9.12.3
## Enabled features: cairo, fontconfig, freetype, heic, lcms, pango, raw, rsvg, webp
## Disabled features: fftw, ghostscript, x11
list.files(path='//Users/macowner/Desktop/Data Science/gif//', pattern = '*.png', full.names = TRUE) %>%
  image_read() %>% # reads each path file
  image_join() %>% # joins image
  image_animate(fps=4) %>% # animates, can opt for number of loops
  image_write("FileName.gif") # write to current dir
## Warning in image_write(., "FileName.gif"): Writing image with 0 frames
#ipa mapping
ipa<-brewcomb_type%>% filter(BeerStyle=="American IPA") %>% group_by(BeerStyle, lat, long) %>% summarize(meanabv = mean(ABV), meanIBU = mean(IBU), beers = n()) %>% arrange(desc(beers))
## `summarise()` has grouped output by 'BeerStyle', 'lat'. You can override using the `.groups` argument.
ipa
## # A tibble: 149 × 6
## # Groups:   BeerStyle, lat [140]
##    BeerStyle      lat   long meanabv meanIBU beers
##    <chr>        <dbl>  <dbl>   <dbl>   <dbl> <int>
##  1 American IPA  33.5 -118.   0.0719    75.7    17
##  2 American IPA  40.3 -106.   0.0796    70.1    15
##  3 American IPA  28.2  -82.7  0.0707    66.3    12
##  4 American IPA  34.8 -119.   0.0745    71.1    11
##  5 American IPA  47.6 -121.   0.0593    59.5    11
##  6 American IPA  39.6  -86.3  0.0695    81.1     8
##  7 American IPA  37.7 -122.   0.0696    63.7     7
##  8 American IPA  40.6  -73.9  0.0729    76.9     7
##  9 American IPA  42.7  -71.9  0.058     55.4     7
## 10 American IPA  44.8  -93.3  0.0696    84       7
## # … with 139 more rows
ggplot() +
  geom_polygon(data = usa, aes(x=long, y = lat, group = group),color = "white", fill="grey", alpha=0.3) +
  geom_point( data=ipa, aes(x=long, y=lat, color=meanIBU, size=beers,  alpha=0.5)) +
  scale_color_viridis(option="viridis",  name="IBU" ) +
  theme_void() +
  scale_size(range = c(1, 10), name=" # of beers")+
  ggtitle("Number of American IPA Beers")

#American IPA was 27% of the beers analyzed, and has higher IBU and ABV than Budweiser.
#There may be an opportunity for Budweiser to develop an American IPA, which has manufacturing in a diverse # of geographic locations across the US.